## Working with Unicode in R -- Rich Nielsen

## I always forget how unicode works in R, and I don't find it very intuitive.
## Here are some examples so that I can remember how to do things because
## most of the online "help" is unhelpful.

## here's some arabic (the basmala, built from unicode escapes)
a <- "\u628\u633\u645 \u0627\uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a

## It prints nicely in R
Encoding(a)

## If you want to write it out to a text file, write the raw UTF-8 bytes
## with useBytes = TRUE.  (The old trick of Encoding(a) <- "bytes" also
## works, but it corrupts `a` for any later use -- it then prints as
## escaped bytes instead of Arabic.)
writeLines(enc2utf8(a), "out.txt", useBytes = TRUE)

## Surprisingly, write() won't round-trip the text.  It goes through cat(),
## which may translate to the native encoding, so on some platforms the
## output is escaped ASCII bytes.  Not super helpful.
write(a, "out2.txt")

## Reading in utf-8 text: mark the incoming strings as UTF-8
b <- readLines("out.txt", encoding = "UTF-8")
b

## sometimes, arabic gets dumped into files and looks like this:
x <- "<U+0627><U+0644><U+0635><U+0641><U+062D><U+0629>"
x
## Not very useful...we'd like to convert x back to unicode.
## Pull out each hex code point and rebuild the string directly with
## intToUtf8() -- no need for the fragile parse()/eval() round-trip.
hex_codes <- regmatches(x, gregexpr("<U\\+[0-9A-Fa-f]+>", x))[[1]]
hex_codes <- gsub("<U\\+|>", "", hex_codes)          # strip the <U+ > wrapper
x3 <- intToUtf8(strtoi(hex_codes, base = 16L))       # code points -> string
x3

## Another thing you might want to do is use unicode in urls
## It turns out that this is represented differently (percent-encoded UTF-8
## bytes) and needs to be fed into a url correctly as below:
a <- "\u628\u633\u645 \u0627\uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a  # the arabic
URLencode(a)  # the arabic in url encoding
## Now we can use this to automate google searches, for example
browseURL(paste0(
  "https://www.google.com/webhp?hl=en&lr=&ie=ISO-8859-1&btnG=Search&gws_rd=ssl#hl=en&lr=&q=",
  URLencode(a), "&btnG=Search"
))
## This can also be done with the RCurl library.
## NOTE: package names are case-sensitive -- the package is "RCurl", not "rCurl".
library(RCurl)
curlEscape(a)


## Sometimes, Arabic comes in cp1256 (Windows Arabic) encoding.  R can't deal with it
## directly -- the bytes have to be re-encoded to UTF-8 first.

## here is a page in cp1256 encoding
## NOTE(review): this example needs a live network connection, and the page
## layout may have changed since this was written -- dat[500] assumes the
## Arabic happens to fall on line 500.
url <- "http://www.almahmood.islamlight.net/index.php?option=content&task=view&id=2609&Itemid=25"
dat <- readLines(url, encoding="bytes")  ## note that we're bringing it in as bytes
## (encoding="bytes" marks the strings as raw bytes so R performs no translation)
dat[500]  ## an example of what the arabic looks like

## the iconv() function will do the conversion from cp1256 bytes to UTF-8
iconv(dat[500], from="CP1256", to="UTF-8")
## you can also do the whole document (iconv is vectorized over character vectors)
iconv(dat, from="CP1256", to="UTF-8")

## Before I found iconv(), I wrote this function below which does the same thing.
## There's no point in using it because it's far slower than iconv(), but the building
## blocks might still be helpful for other problems.

## source the function replacing cp1256 with utf-8 characters
## (mapping table: ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT)
## NOTE(review): this sources and executes remote code over plain HTTP --
## only run it if you trust the host, and prefer an https URL if available.
source("http://www.mit.edu/~rnielsen/convert%20cp1256.txt")
## the function "cp1256ToUTF8()" should now be available; printing it shows its body
cp1256ToUTF8

## convert just a line from the document (dat comes from the readLines() call above)
datUtf8 <- cp1256ToUTF8(dat[500])
datUtf8

## convert the whole document: collapse to one string, then crudely strip
## HTML tags with a non-greedy regex (this is naive tag removal, not a parser)
dat <- paste(dat, collapse=" ")
dat <- gsub("<.*?>", " ", dat)
## note that some errors get thrown if the function runs into words it can't parse 
cp1256ToUTF8(dat)


## Sometimes, against your better judgment you use MS excel for some spreadsheet that has Arabic unicode.
## If so, bringing it into R is either tricky or impossible, depending on your version of MS excel.
## In general, I find the best results from saving the .xls[x] file as "Unicode" in excel, then opening
## that file in a text editor and saving it as utf-8.  Then, I open it in R with something like:
## (use TRUE, not T -- T is an ordinary variable and can be reassigned)
## NOTE(review): on Windows you may need fileEncoding = "UTF-8" instead of
## encoding = "UTF-8" so the file is translated as it is read -- verify locally.
dat <- read.table("path/to/file.txt", as.is = TRUE, header = TRUE, fill = TRUE,
                  sep = "\t", encoding = "UTF-8")


